{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>26.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>35.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age     Sex Embarked Survived\n",
       "0  22.0    male        S       No\n",
       "1  38.0  female        C      Yes\n",
       "2  26.0  female        S      Yes\n",
       "3  35.0  female        S      Yes\n",
       "4  35.0    male        S       No"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "data = pd.read_csv(r\"Na.csv\"\n",
    "                   ,index_col=0\n",
    "                  )#index_col=0将第0列作为索引，不写则认为第0列为特征\n",
    " \n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder \n",
    "y = data.iloc[:,-1]                         #要输入的是标签，不是特征矩阵，所以允许一维"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0           No\n",
       "1          Yes\n",
       "2          Yes\n",
       "3          Yes\n",
       "4           No\n",
       "        ...   \n",
       "886         No\n",
       "887        Yes\n",
       "888         No\n",
       "889    Unknown\n",
       "890         No\n",
       "Name: Survived, Length: 891, dtype: object"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['No', 'Unknown', 'Yes'], dtype=object)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "le = LabelEncoder()                         #实例化\n",
    "le = le.fit(y)                              #导入数据\n",
    "label = le.transform(y)                     #transform接口调取结果\n",
    "label\n",
    "le.classes_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>26.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>35.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age     Sex Embarked  Survived\n",
       "0  22.0    male        S         0\n",
       "1  38.0  female        C         2\n",
       "2  26.0  female        S         2\n",
       "3  35.0  female        S         2\n",
       "4  35.0    male        S         0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.iloc[:,-1]=label\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "data_ = data.copy()\n",
    "data_.dropna(axis=0,inplace=True)\n",
    "data_.head()\n",
    "OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 1., 0., 0., 1.],\n",
       "       [1., 0., 1., 0., 0.],\n",
       "       [1., 0., 0., 0., 1.],\n",
       "       ...,\n",
       "       [1., 0., 0., 0., 1.],\n",
       "       [0., 1., 1., 0., 0.],\n",
       "       [0., 1., 0., 1., 0.]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "data = data_.copy()\n",
    "X = data.iloc[:,1:-1]\n",
    " \n",
    "enc = OneHotEncoder(categories='auto').fit(X)\n",
    "result = enc.transform(X).toarray()\n",
    "result\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Female</th>\n",
       "      <th>Male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>26.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  Survived  Female  Male  Embarked_C  Embarked_Q  Embarked_S\n",
       "0  22.0       0.0     0.0   1.0         0.0         0.0         1.0\n",
       "1  38.0       2.0     1.0   0.0         1.0         0.0         0.0\n",
       "2  26.0       2.0     1.0   0.0         0.0         0.0         1.0\n",
       "3  35.0       2.0     1.0   0.0         0.0         0.0         1.0\n",
       "4  35.0       0.0     0.0   1.0         0.0         0.0         1.0"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#依然可以直接一步到位，但为了给大家展示模型属性，所以还是写成了三步\n",
    "OneHotEncoder(categories='auto').fit_transform(X).toarray()\n",
    " \n",
    "#依然可以还原\n",
    "pd.DataFrame(enc.inverse_transform(result))\n",
    " \n",
    "enc.get_feature_names()#返回每一个经过哑变量后生成稀疏矩阵列的名字\n",
    " \n",
    "result\n",
    "result.shape\n",
    " \n",
    "#axis=1,表示跨行进行合并，也就是将两表左右相连，如果是axis=0，就是将量表上下相连\n",
    "newdata = pd.concat([data,pd.DataFrame(result)],axis=1)\n",
    " \n",
    "newdata.head()\n",
    " \n",
    "newdata.drop([\"Sex\",\"Embarked\"],axis=1,inplace=True)\n",
    " \n",
    "newdata.columns = [\"Age\",\"Survived\",\"Female\",\"Male\",\"Embarked_C\",\"Embarked_Q\",\"Embarked_S\"]\n",
    " \n",
    "newdata.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>1.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Age     Sex Embarked  Survived\n",
       "0  0.0    male        S         0\n",
       "1  1.0  female        C         2\n",
       "2  0.0  female        S         2\n",
       "3  1.0  female        S         2\n",
       "4  1.0    male        S         0"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#将年龄二值化\n",
    " \n",
    "data_2 = data.copy()\n",
    " \n",
    "from sklearn.preprocessing import Binarizer\n",
    "X = data_2.iloc[:,0].values.reshape(-1,1)               #类为特征专用，所以不能使用一维数组\n",
    "transformer = Binarizer(threshold=30).fit_transform(X)\n",
    " \n",
    "data_2.iloc[:,0] = transformer\n",
    "data_2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
